library(readr)
bc = read_delim("r1.withq.barcodes", delim=" ")
library(dplyr)
grouped = bc %>% group_by(barcode, index) %>% summarise(count=n(), bcq=mean(barcodeq),
iq=mean(indexq))
There are 65778 unique barcode-index pairs and of those 5838 are seen more than 10 times.
We can see below that there is are a small set of barcode-index pairs that are represented a high number of times.
library(ggplot2)
ggplot(grouped, aes(count)) + geom_histogram() + theme_bw() +
scale_x_log10() + scale_y_sqrt()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now we’ll mark up the barcode-index pair with whether or not these are some of the known sequences. The indexes we found correspond to the reverse complement column in the index file and the barcodes correspond to the sense column in the barcode file.
kbc = read_csv("metadata/BC sequences.csv")
kin = read_csv("metadata/indexseqs.csv")
grouped$kin = grouped$index %in% kin$rc
grouped$kbc = grouped$barcode %in% kbc$sense
grouped$known = grouped$kin & grouped$kbc
grouped$known = ifelse(grouped$known, "known", "unknown")
expected = read_csv("metadata/expected-pairs.csv")
colnames(expected) = c("ebarcode", "eindex")
expected$pair = paste(expected$ebarcode, expected$eindex, sep=":")
grouped$pair = paste(grouped$barcode, grouped$index, sep=":")
grouped$expected = grouped$pair %in% expected$pair
known = subset(grouped, known == "known")
I wrote out the barcode-index pairs with counts, the mean of the barcode quality (bcq), the mean of the index quality (iq), whether or not the index is a known index (kin), whether or not the barcode is a known barcode (kin), and whether or not the barcode-index pair is expected (known).
This table is here All barcode index
write.table(grouped, file="all-barcode-index-unfiltered.csv", col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
Now we can see if we mark up the barcode-index pairs with whether or not they are from the known barcode and known index files, we find that there are only 940 barcode-index pairs.
If we plot the histograms separating out whether they are a known or unknown barcode-index pair, we can see that the known barcode-index pairs have many more reads associated with them than the unknown barcode-index pairs.
library(ggplot2)
ggplot(grouped, aes(count)) + geom_histogram() + theme_bw() +
facet_wrap(~known) +
scale_x_log10() + scale_y_sqrt() +
ylab("number of barcode-index pairs")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We can see that the PHRED quality of the known barcode-index pairs has a tight distribution:
ggplot(grouped, aes(bcq, iq)) + geom_point() + theme_bw() +
facet_wrap(~known) +
ylab("mean PHRED quality of index") +
xlab("mean PHRED quality of barcode")
Here you can see that if we filter for only known barcodes and indexes, they tend to have both a high index and barcode quality compared to unknown sequences.
ggplot(grouped, aes(bcq, iq, color=known)) + geom_point(size=0.5) + theme_bw() +
facet_wrap(~kin+kbc) +
ylab("mean PHRED quality of index") +
xlab("mean PHRED quality of barcode")
If we look at the expected barcode-index pairings, we can see that there is no difference in the distribution of quality of the barcode-index pairs for the pairs we expect (TRUE) and the pairs we do not expect (FALSE). Filtering on quality will not help these results.
ggplot(subset(grouped, known == "known"), aes(bcq, iq)) +
geom_point(size=0.5) + theme_bw() +
facet_wrap(~expected) +
ylab("mean PHRED quality of index") +
xlab("mean PHRED quality of barcode")
We can see that the barcode-index pairs have about an order of magnitude less counts when they are not expected than when they are expected.
ggplot(subset(grouped, known == "known"), aes(count)) +
geom_histogram() + theme_bw() +
facet_wrap(~expected) +
scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
We’re expecting 2 to 3 barcodes per index, but we generally see more than that if we’re seeing 940 barcodes. How many per barcode?
barcode_per_index = known %>% group_by(index) %>% summarise(nbarcodes = n())
index_per_barcode = known %>% group_by(barcode) %>% summarise(nindex = n())
ggplot(barcode_per_index, aes(index, nbarcodes)) +
geom_bar(stat='identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("")
ggplot(index_per_barcode, aes(barcode, nindex)) + geom_bar(stat='identity') +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("")
We see every index with every barcode at least once. Are some overrepresented?
ggplot(known, aes(barcode, count)) + facet_wrap(~index) + geom_bar(stat='identity') +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
For some indexes it looks like yes, and for others no.
ggplot(known, aes(barcode, count, fill=iq)) + facet_wrap(~index) +
geom_bar(stat='identity') +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
guides(fill=guide_legend(title="Index quality"))
ggplot(known, aes(barcode, count, fill=bcq)) + facet_wrap(~index) +
geom_bar(stat='identity') +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank()) +
guides(fill=guide_legend(title="Barcode quality"))
ggplot(known, aes(iq, bcq)) + facet_wrap(~index) +
geom_point() + xlab("index quality") +
ylab("barcode quality") + theme_bw()
Here I filtered by keeping only the barcodes for each index which were at least twice as prevalent as the average of the barcodes for that index. Then I filtered out all indexes that had more than 3 barcodes that passed that filter.
filtered = known %>% group_by(index) %>%
mutate(mcount = mean(count)) %>%
filter(count > 2*mcount) %>% group_by(index) %>%
mutate(nfiltered = n()) %>%
filter(nfiltered < 4)
That leaves us with these:
ggplot(filtered, aes(barcode, count)) + facet_wrap(~index) +
geom_bar(stat='identity') + scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
expected = read_csv("metadata/expected-pairs.csv")
colnames(expected) = c("ebarcode", "eindex")
expected$pair = paste(expected$ebarcode, expected$eindex, sep=":")
filtered$pair = paste(filtered$barcode, filtered$index, sep=":")
filtered$expected = filtered$pair %in% expected$pair
ggplot(filtered, aes(barcode, count, fill=expected)) + facet_wrap(~index) +
geom_bar(stat='identity') + scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
output = filtered %>% select(barcode, index, count)
write.table(output, file="barcode-index.csv", col.names=TRUE,
sep=",", quote=FALSE, row.names=FALSE)
known$pair = paste(known$barcode, known$index, sep=":")
known$expected = known$pair %in% expected$pair
ggplot(known, aes(barcode, count, fill=expected)) +
facet_wrap(~index) + geom_bar(stat='identity') +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
index_mixes = c("ATAACGGT", "TCAGCATT", "TGTGACTA")
ggplot(subset(known, index %in% index_mixes),
aes(barcode, count, fill=expected)) +
facet_wrap(~index) + geom_bar(stat='identity') +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
Coloring by barcode and index quality:
ggplot(subset(known, index %in% index_mixes),
aes(barcode, count, color=iq)) +
facet_wrap(~index+expected) + geom_point() +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
ggplot(subset(known, index %in% index_mixes),
aes(barcode, count, color=bcq)) +
facet_wrap(~index+expected) + geom_point() +
scale_y_sqrt() +
theme_bw() +
theme(axis.ticks = element_blank(), axis.text.x = element_blank())
Filtering out by barcode quality might help with some but doesn’t weed out all of the false positives.
output = filtered %>% select(barcode, index, count, expected)
write.table(output, file="filtered-barcode-index.csv", col.names=TRUE,
sep=",", quote=FALSE, row.names=FALSE)
output = known %>% select(barcode, index, count, expected)
write.table(output, file="unfiltered-barcode-index.csv", col.names=TRUE,
sep=",", quote=FALSE, row.names=FALSE)
I included an example of a CSV file of the expected pairs that is much easier to work with: